Aleksandra Dąbrowska - preparing the data, association rules
Alicja Gosiewska - creating plots, association rules
Szymon Górka - creating Markov chains, clustering
Zuzanna Kasprowicz - clustering, making the knit
However, each of us contributed to all parts of the project, so this division into tasks does not fully reflect our individual input. Each of us has done the same amount of work.
Copernicus Science Centre (Polish: Centrum Nauki Kopernik) is a science museum in Warsaw. It contains over 450 interactive exhibits that enable visitors to single-handedly carry out experiments and discover the laws of science for themselves. The Centre is the largest institution of its type in Poland and one of the most advanced in Europe.
Our task was to analyze and understand data from CNK. The dataset contains information about visitors and their interaction time with exhibits.
We were trying to answer the following questions:
We used the following libraries:
library(lubridate)
library(plyr)
library(dplyr)
library(ggplot2)
library(arules)
library(arulesViz)
library(cluster)
library(stringdist)
library(gridExtra)
library(reshape2)
library(igraph)
library(network)
library(sna)
library(ndtv)
library(qgraph)
We started by preparing the dataset as follows:
# Set the working directory to the project folder (machine-specific path).
sciezka <- "C:/Alicja/R/2016_Statystyka_II_Projekt_1/faza3/nowe/proba/raport"
setwd(sciezka)
# Load the cleaned interaction data: one row per (visitor, station) with the
# first/last interaction timestamps and per-visit summary columns.
load("dane_czyste.rda")
head(dane_czyste)
## visitor station Time_max Time_min Time_station
## 1 795453 cnk07 2012-01-03 10:34:14 2012-01-03 10:32:51 83 secs
## 2 795453 cnk09 2012-01-03 10:37:44 2012-01-03 10:35:55 109 secs
## 3 795453 cnk16 2012-01-03 10:38:40 2012-01-03 10:37:49 51 secs
## 4 795453 cnk23 2012-01-03 10:46:21 2012-01-03 10:46:16 5 secs
## 5 795453 cnk61 2012-01-03 10:53:18 2012-01-03 10:53:00 18 secs
## 6 795453 cnk29a 2012-01-03 11:00:27 2012-01-03 10:54:52 335 secs
## Time_total Path_length isFirst isLast
## 1 883 10 cnk07 <NA>
## 2 883 10 <NA> <NA>
## 3 883 10 <NA> <NA>
## 4 883 10 <NA> <NA>
## 5 883 10 <NA> <NA>
## 6 883 10 <NA> <NA>
# Load the per-visitor path data: station codes concatenated in visit order
# (with the "cnk" prefix stripped).
load("dane_sciezki.rda")
head(dane_sciezki)
## visitor Path
## 1 795453 070916236129a69572442a
## 2 795455 46a29a6610216972
## 3 795468 0307172112
## 4 795476 67071716121878a26612225694929a58b555742a38
## 5 795477 262446a563258b55
## 6 795486 02b0366122356327961
# Load the per-visitor summary used for clustering: total time and path length.
load("dane_klaster.rda")
head(dane_klaster)
## visitor Time_total Path_length
## 1 795453 883 10
## 2 795455 1458 7
## 3 795468 330 5
## 4 795476 1507 19
## 5 795477 1006 7
## 6 795486 851 9
# Load the raw interaction logs and restrict them to January; also drop the
# placeholder visitor id "-1" used for unidentified interactions.
load("smallLogs.rda")
dane <- smallLogs
library(lubridate)
# month() returns a numeric month directly, so the column does not need to be
# pre-initialised with NA, and the comparison can be numeric rather than
# against the string "1" (the original relied on implicit coercion).
dane$month <- month(dane$date)
dane <- subset(dane, month == 1)
# data cleaning: remove unidentified visitors
dane <- subset(dane, visitor != "-1")
library(plyr)
# For every (visitor, station) pair take the earliest and latest interaction
# time; the difference is the time spent at that station.
Time_min<-aggregate(date~visitor+station,dane, FUN=min)
Time_max<-aggregate(date~visitor+station,dane, FUN=max)
library(dplyr)
Time<-left_join(Time_max,Time_min,by=c("visitor","station"))
# Column 3 is the max time, column 4 the min time, so this difference is the
# per-station interaction time (a difftime in seconds).
Time$Time_total <- Time[,3] - Time[,4]
# Order chronologically within each visitor.
Time<-Time[order(Time[,1],Time[,3]),]
# Number of stations per visitor (later renamed Path_length).
Path_count<-count(Time,visitor)
# Total interaction time per visitor, summed over stations.
dane_czas<-aggregate(Time_total~visitor,Time,FUN=sum)
dane_czas<-left_join(dane_czas,Path_count,by="visitor")
dane_czyste<-left_join(Time,dane_czas,by="visitor")
# Rename the positionally-built columns to their final names.
colnames(dane_czyste)[3]<-"Time_max"
colnames(dane_czyste)[4]<-"Time_min"
colnames(dane_czyste)[5]<-"Time_station"
colnames(dane_czyste)[6]<-"Time_total"
colnames(dane_czyste)[7]<-"Path_length"
# Outlier filtering: drop visits longer than 9000 s and paths of 4 or fewer
# stations.
dane_czyste<-subset(dane_czyste,(dane_czyste$Time_total<9000))
dane_czyste<-subset(dane_czyste,(dane_czyste$Path_length>4))
# Mark for each visitor the first station on the path (the row whose Time_min
# equals the visitor-wide minimum); NA everywhere else.
Temp<-aggregate(Time_min~visitor,dane_czyste,FUN=min)
colnames(Temp)[2] <- "temp_min"
dane_czyste<-left_join(dane_czyste,Temp,by = "visitor")
dane_czyste$isFirst <- ifelse(dane_czyste$Time_min==dane_czyste$temp_min,as.character(dane_czyste$station),NA)
# ... and analogously the last station (latest Time_max).
Temp<-aggregate(Time_max~visitor,dane_czyste,FUN=max)
colnames(Temp)[2] <- "temp_max"
dane_czyste<-left_join(dane_czyste,Temp,by = "visitor")
dane_czyste$isLast <- ifelse(dane_czyste$Time_max==dane_czyste$temp_max,as.character(dane_czyste$station),NA)
rm(Temp)
# Drop the helper columns temp_min (8) and temp_max (10).
dane_czyste<-dane_czyste[,-c(8,10)]
# Concatenate each visitor's stations (already in chronological order) into
# one path string, then strip the "cnk" prefix from the station codes.
dane_sciezki<- aggregate(dane_czyste$station, dane_czyste["visitor"], paste, collapse="")
colnames(dane_sciezki)[2]<-"Path"
dane_sciezki$Path<-gsub('cnk','',dane_sciezki$Path)
# One row per visitor: id (col 1), Time_total (col 6), Path_length (col 7).
dane_klaster <- distinct(dane_czyste[,c(1,6,7)])
In this chapter we will see some basic statistics and interesting information about visitors and exhibits in CNK.
Average time spent in CNK (in seconds)
mean(as.numeric(dane_klaster$Time_total))
## [1] 1329.065
Median time spent in CNK (in seconds)
median(as.numeric(dane_klaster$Time_total))
## [1] 920
Average path length
mean(dane_klaster$Path_length)
## [1] 11.60206
Median path length
median(dane_klaster$Path_length)
## [1] 10
For each station we can see the number of visitors who visited it:
library(ggplot2)
library(dplyr)
# Count how many (visitor, station) interaction rows each station has.
station_counts <- dane_czyste %>%
  group_by(station) %>%
  summarise(n())
colnames(station_counts)[2] <- "liczba"
# Reorder the factor levels by frequency so the flipped bars come out sorted.
station_counts$station <- factor(
  station_counts$station,
  levels = station_counts$station[order(station_counts$liczba)]
)
ggplot(station_counts, aes(x = factor(station), y = liczba)) +
  geom_bar(stat = "identity") +
  coord_flip()
From the above plot we discovered that the most frequently visited station is cnk16 and the least frequently visited is cnk29a.
Let’s look at plots showing, for each station, the number of guests who start and end their visit to CNK at that station.
library(ggplot2)
library(dplyr)
# Frequency of each station as the FIRST stop on a visitor's path.
first_counts <- dane_czyste %>%
  group_by(isFirst) %>%
  summarise(n())
colnames(first_counts)[2] <- "liczba"
# The last row is the NA group (rows that are not first stops) - drop it.
first_counts <- first_counts[-nrow(first_counts), ]
# Order factor levels by frequency so the bars come out sorted.
first_counts$isFirst <- factor(
  first_counts$isFirst,
  levels = first_counts$isFirst[order(first_counts$liczba)]
)
ggplot(first_counts, aes(x = factor(isFirst), y = liczba)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Frequency of the first station on the path")
# Frequency of each station as the LAST stop on a visitor's path.
last_counts <- dane_czyste %>%
  group_by(isLast) %>%
  summarise(n())
colnames(last_counts)[2] <- "liczba"
last_counts <- last_counts[-nrow(last_counts), ]
last_counts$isLast <- factor(
  last_counts$isLast,
  levels = last_counts$isLast[order(last_counts$liczba)]
)
ggplot(last_counts, aes(x = factor(isLast), y = liczba)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Frequency of the last station on the path")
From the above plots we can see that the first most frequent station was cnk02a and the last was cnk63a.
In this chapter we will discuss methods of clustering and optimal number of clusters.
We started with the simplest clustering method — the k-means method. We tried to cluster visitors by total time spent in CNK and path length.
First we normalized our data, because time and length are on different scales.
library(cluster)
library(stringdist)
library(dplyr)
# One row per visitor: id (col 1), Time_total (col 6), Path_length (col 7).
dane_klaster <- distinct(dane_czyste[,c(1,6,7)])
#norming data: square root first (presumably to reduce the right skew of the
#distributions - confirm), then scale() so both features are standardized
#and comparable for k-means
dane_klaster$Time_total_norm <- scale(sqrt(as.numeric(dane_klaster$Time_total)))
dane_klaster$Path_length_norm <- scale(sqrt(dane_klaster$Path_length))
To find the optimal number of clusters we calculated the distances within clusters for cluster counts from 2 to 10.
library(cluster)
library(stringdist)
library(dplyr)
# Total within-cluster sum of squares for k = 2..10.
# vapply (instead of sapply) guarantees a numeric vector result, and the
# normalised feature columns are addressed by name rather than by position
# (they were columns 4 and 5), which is robust to column reordering.
# NOTE(review): kmeans() is stochastic; results vary without set.seed().
Kmax <- 10
WC <- vapply(2:Kmax, function(k) {
  grupy <- kmeans(dane_klaster[, c("Time_total_norm", "Path_length_norm")],
                  centers = k, nstart = 10)
  sum(grupy$withinss)
}, numeric(1))
Next we calculated the distances between clusters, from 2 to 10.
library(cluster)
library(stringdist)
library(dplyr)
# Total between-cluster sum of squares for k = 2..10, analogous to WC above.
# vapply guarantees a numeric vector; columns are addressed by name rather
# than by position (they were columns 4 and 5).
Bmax <- 10
WB <- vapply(2:Bmax, function(k) {
  grupy <- kmeans(dane_klaster[, c("Time_total_norm", "Path_length_norm")],
                  centers = k, nstart = 10)
  sum(grupy$betweenss)
}, numeric(1))
Let’s compare our results:
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Side-by-side comparison of within- and between-cluster distances for
# k = 2..10; both panels share the same y-axis limits so they are comparable.
plot_within <- ggplot(data.frame(K = factor(2:Kmax), WC), aes(K, WC)) +
  geom_bar(stat = "identity") +
  ylim(0, 30000) +
  ggtitle("Distances within clusters")
plot_between <- ggplot(data.frame(K = factor(2:Bmax), WB), aes(K, WB)) +
  geom_bar(stat = "identity") +
  ylim(0, 30000) +
  ggtitle("Distances between clusters")
grid.arrange(plot_within, plot_between, ncol = 2)
We can conclude that the optimal number of clusters is 4, because for numbers greater than 4 we observe a slower decrease in the distances within clusters and a slower increase in the distances between clusters.
For the optimal number of clusters we present the plot of the partition of the data into clusters.
library(cluster)
library(stringdist)
library(dplyr)
# Final clustering with the chosen k = 4. The normalised feature columns are
# addressed by name instead of by position (they were columns 4 and 5), which
# is robust to column reordering; the output is otherwise unchanged.
grupy <- kmeans(dane_klaster[, c("Time_total_norm", "Path_length_norm")],
                centers = 4, nstart = 10)
dane_klaster$Group_k <- factor(grupy$cluster)
ggplot(dane_klaster, aes(Time_total_norm, Path_length_norm, color = Group_k)) +
  geom_point(size = 2) +
  theme_bw() +
  ggtitle("Division into clusters")
Next we wanted to take into consideration the composition of our paths. We analyzed the differences between paths.
To obtain the matrix of distances between paths we used the Damerau–Levenshtein distance. In this metric the distance between two strings is given by counting the minimum number of operations needed to transform one string into the other, where an operation is defined as an insertion, deletion, or substitution of a single character, or a transposition of two adjacent characters.
The Damerau-Levenshtein distance differs from the classical Levenshtein distance by including transpositions among its allowable operations. The classical Levenshtein distance only allows insertion, deletion, and substitution operations.
library(stringdist)
# Damerau-Levenshtein distance between every pair of visitor paths,
# converted from the returned "dist" object into a plain data frame.
distances <- as.data.frame(
  as.matrix(stringdistmatrix(dane_sciezki$Path, method = "dl"))
)
In this case we used the CLARA function, because CLARA is designed for clustering large datasets.
# Load the precomputed distance matrix (computing it above is expensive) and
# cluster the paths into 4 groups with CLARA, which works on data samples and
# therefore scales to large datasets better than PAM.
load("distances.rda")
library(cluster)
klastry <- clara(distances, k=4)
We presented division on clusters on the following plot:
For better understanding the dataset we looked for some common rules which describe visited stations. Our first step was creating the matrix of transactions, which contains the information about stations visited by each visitor.
library(arules)
library(arulesViz)
library(ggplot2)
# Build a visitor x station incidence matrix: TRUE when the visitor
# interacted with the station at least once, then coerce it to the
# "transactions" class required by apriori().
to_apriori <- dane_czyste[, c(2, 1)]
to_apriori <- t(table(to_apriori))
# Keep only stations with at least one recorded visit.
to_apriori <- to_apriori[, colSums(to_apriori != 0) > 0]
to_apriori <- as.data.frame(to_apriori)
to_apriori <- to_apriori >= 1
trans <- as(to_apriori, "transactions")
As a result we obtained a sparse matrix:
library(arules)
library(arulesViz)
# Peek at the underlying sparse matrix slot ("|" marks a visited station).
trans@data[1:10,1:50] #check
## 10 x 50 sparse Matrix of class "ngCMatrix"
##
## [1,] . . . . . . . . . . . . | . . . . . | | . . . . . | . . . | . . . .
## [2,] . . . . . | . . . . . . . . . . . . . . . . . . | | . . . . . . . .
## [3,] . . | . . | . . . . . . . . . . . . | | . . | . . . | . | . . . | .
## [4,] . . . . . . | . . | | | . . . . . . | . . . . . | . . . | . . | | |
## [5,] . . . . . . . . . . . . . . | . . . . . . . . . . . . . . . . . . .
## [6,] | . | | . . . . . . . . . . . . . . | . . . . . . | . . . . . | . .
## [7,] | . . . . . . | . . | . . . . . . . | . . . . . . . . . | . | | | .
## [8,] . . . . . . . . . . . . . . . . . . . . . . . . . | . . . . . . . .
## [9,] . | . . . . | . | . | . . . . . . . . | | . | . . | . . | | . | . |
## [10,] . . . . . . . . . . | . . . . . . . | . . . . . | . . . . . . | | .
##
## [1,] . . . | | . . . . . | | . . | |
## [2,] . . . . . | . . . . . . . . . .
## [3,] . . . | . . . . . | . . | | | .
## [4,] | . . | | | . . . . . . . . | .
## [5,] | . . . | | . . . . . . . . | .
## [6,] . . . . | | | | . . | . | | | .
## [7,] | . . | | | . . . . | . | | | .
## [8,] . . . . . . . . . . . . . . . .
## [9,] . . . | . | . . . . | . . . . .
## [10,] | . . | . . | . . | | . . | . .
image(head(trans,300)) #rzadka macierz
We were looking for rules.
library(arules)
library(arulesViz)
# Mine association rules with minimum support 0.02; the confidence threshold
# stays at the apriori() default of 0.8 (see the parameter listing below).
rules <- apriori(trans, parameter = list(support = .02))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport support minlen maxlen
## 0.8 0.1 1 none FALSE TRUE 0.02 1 10
## target ext
## rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 294
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[59 item(s), 14731 transaction(s)] done [0.00s].
## sorting and recoding items ... [59 item(s)] done [0.02s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.15s].
## writing ... [93 rule(s)] done [0.00s].
## creating S4 object ... done [0.02s].
# Keep only rules whose right-hand side is cnk12 or cnk16; this drops the
# trivial cnk19b => cnk19a rules.
smaller_rules<-subset(rules,subset=rhs %in% c("cnk12","cnk16"))
rules
## set of 93 rules
library(arules)
library(arulesViz)
library(ggplot2)
# Plot the 20 most frequent stations, then print all mined rules; inspect()
# also returns the rules as a data frame, reused for the scatter plots below.
itemFrequencyPlot(trans, topN =20)
reguly <- inspect(rules)
## lhs rhs support confidence
## 1 {cnk19b} => {cnk19a} 0.09395153 0.9808646
## 2 {cnk06,cnk19b} => {cnk19a} 0.02321635 0.9913043
## 3 {cnk19b,cnk23} => {cnk19a} 0.02009368 0.9833887
## 4 {cnk19b,cnk61} => {cnk19a} 0.02138348 0.9843750
## 5 {cnk19b,cnk67} => {cnk19a} 0.02097617 0.9840764
## 6 {cnk19b,cnk39} => {cnk19a} 0.02090829 0.9777778
## 7 {cnk11,cnk19b} => {cnk19a} 0.02688209 0.9729730
## 8 {cnk02b,cnk19b} => {cnk19a} 0.02599959 0.9820513
## 9 {cnk19b,cnk21} => {cnk19a} 0.02138348 0.9843750
## 10 {cnk19b,cnk69} => {cnk19a} 0.02179078 0.9876923
## 11 {cnk19b,cnk20} => {cnk19a} 0.02389519 0.9643836
## 12 {cnk07,cnk19b} => {cnk19a} 0.02457403 0.9863760
## 13 {cnk02a,cnk19b} => {cnk19a} 0.03197339 0.9731405
## 14 {cnk05,cnk19b} => {cnk19a} 0.02701785 0.9827160
## 15 {cnk17,cnk19b} => {cnk19a} 0.02661055 0.9775561
## 16 {cnk19b,cnk66} => {cnk19a} 0.02538864 0.9739583
## 17 {cnk10,cnk19b} => {cnk19a} 0.02620324 0.9846939
## 18 {cnk09,cnk19b} => {cnk19a} 0.02681420 0.9729064
## 19 {cnk19b,cnk78a} => {cnk19a} 0.02986898 0.9843400
## 20 {cnk18,cnk19b} => {cnk19a} 0.02919014 0.9862385
## 21 {cnk12,cnk19b} => {cnk19a} 0.03149820 0.9727463
## 22 {cnk16,cnk19b} => {cnk19a} 0.04161293 0.9745628
## 23 {cnk16,cnk20,cnk21,cnk67} => {cnk12} 0.02043310 0.8005319
## 24 {cnk05,cnk16,cnk21,cnk67} => {cnk12} 0.02097617 0.8005181
## 25 {cnk09,cnk21,cnk66,cnk67} => {cnk12} 0.02016156 0.8250000
## 26 {cnk16,cnk21,cnk66,cnk67} => {cnk12} 0.02199443 0.8000000
## 27 {cnk09,cnk16,cnk21,cnk67} => {cnk12} 0.02240174 0.8168317
## 28 {cnk09,cnk20,cnk66,cnk67} => {cnk12} 0.02056887 0.8681948
## 29 {cnk09,cnk16,cnk20,cnk67} => {cnk12} 0.02213020 0.8190955
## 30 {cnk09,cnk16,cnk20,cnk72} => {cnk12} 0.02016156 0.8005391
## 31 {cnk11,cnk20,cnk21,cnk66} => {cnk12} 0.02002580 0.8060109
## 32 {cnk09,cnk11,cnk20,cnk21} => {cnk12} 0.02084040 0.8143236
## 33 {cnk11,cnk16,cnk20,cnk21} => {cnk12} 0.02308058 0.8018868
## 34 {cnk07,cnk11,cnk16,cnk21} => {cnk12} 0.02199443 0.8000000
## 35 {cnk05,cnk11,cnk16,cnk21} => {cnk12} 0.02090829 0.8083990
## 36 {cnk09,cnk11,cnk21,cnk66} => {cnk12} 0.02090829 0.8062827
## 37 {cnk09,cnk11,cnk20,cnk66} => {cnk12} 0.02117982 0.8432432
## 38 {cnk07,cnk11,cnk12,cnk66} => {cnk16} 0.02301269 0.8149038
## 39 {cnk11,cnk12,cnk17,cnk66} => {cnk16} 0.02301269 0.8033175
## 40 {cnk05,cnk07,cnk20,cnk21} => {cnk12} 0.02043310 0.8246575
## 41 {cnk07,cnk17,cnk20,cnk21} => {cnk12} 0.02226597 0.8220551
## 42 {cnk07,cnk20,cnk21,cnk66} => {cnk12} 0.02206232 0.8145363
## 43 {cnk07,cnk10,cnk20,cnk21} => {cnk12} 0.02179078 0.8065327
## 44 {cnk07,cnk09,cnk20,cnk21} => {cnk12} 0.02375942 0.8454106
## 45 {cnk07,cnk16,cnk20,cnk21} => {cnk12} 0.02599959 0.8080169
## 46 {cnk05,cnk17,cnk20,cnk21} => {cnk12} 0.02117982 0.8342246
## 47 {cnk05,cnk20,cnk21,cnk66} => {cnk12} 0.02240174 0.8250000
## 48 {cnk05,cnk10,cnk20,cnk21} => {cnk12} 0.02280904 0.8296296
## 49 {cnk05,cnk09,cnk20,cnk21} => {cnk12} 0.02240174 0.8418367
## 50 {cnk05,cnk16,cnk20,cnk21} => {cnk12} 0.02538864 0.8077754
## 51 {cnk17,cnk20,cnk21,cnk66} => {cnk12} 0.02457403 0.8302752
## 52 {cnk10,cnk17,cnk20,cnk21} => {cnk12} 0.02280904 0.8215159
## 53 {cnk09,cnk17,cnk20,cnk21} => {cnk12} 0.02491345 0.8475751
## 54 {cnk16,cnk17,cnk20,cnk21} => {cnk12} 0.02735727 0.8108652
## 55 {cnk10,cnk20,cnk21,cnk66} => {cnk12} 0.02335211 0.8000000
## 56 {cnk09,cnk20,cnk21,cnk66} => {cnk12} 0.02661055 0.8448276
## 57 {cnk20,cnk21,cnk66,cnk78a} => {cnk12} 0.02029733 0.8081081
## 58 {cnk09,cnk10,cnk20,cnk21} => {cnk12} 0.02457403 0.8246014
## 59 {cnk09,cnk20,cnk21,cnk78a} => {cnk12} 0.02043310 0.8224044
## 60 {cnk09,cnk18,cnk20,cnk21} => {cnk12} 0.02056887 0.8015873
## 61 {cnk09,cnk16,cnk20,cnk21} => {cnk12} 0.02973322 0.8081181
## 62 {cnk05,cnk07,cnk17,cnk21} => {cnk12} 0.02029733 0.8037634
## 63 {cnk05,cnk07,cnk10,cnk21} => {cnk12} 0.02104406 0.8031088
## 64 {cnk05,cnk07,cnk09,cnk21} => {cnk12} 0.02165501 0.8035264
## 65 {cnk07,cnk17,cnk21,cnk66} => {cnk12} 0.02260539 0.8024096
## 66 {cnk07,cnk09,cnk21,cnk78a} => {cnk12} 0.02009368 0.8131868
## 67 {cnk05,cnk17,cnk21,cnk66} => {cnk12} 0.02172290 0.8080808
## 68 {cnk05,cnk09,cnk17,cnk21} => {cnk12} 0.02233385 0.8004866
## 69 {cnk05,cnk09,cnk21,cnk66} => {cnk12} 0.02369153 0.8097448
## 70 {cnk05,cnk09,cnk10,cnk21} => {cnk12} 0.02389519 0.8018223
## 71 {cnk09,cnk21,cnk66,cnk78a} => {cnk12} 0.02104406 0.8010336
## 72 {cnk05,cnk07,cnk20,cnk66} => {cnk12} 0.02104406 0.8072917
## 73 {cnk07,cnk17,cnk20,cnk66} => {cnk12} 0.02396307 0.8286385
## 74 {cnk07,cnk09,cnk17,cnk20} => {cnk12} 0.02321635 0.8028169
## 75 {cnk07,cnk09,cnk20,cnk66} => {cnk12} 0.02409884 0.8352941
## 76 {cnk07,cnk16,cnk20,cnk66} => {cnk12} 0.02735727 0.8027888
## 77 {cnk07,cnk09,cnk16,cnk20} => {cnk12} 0.02776458 0.8019608
## 78 {cnk05,cnk17,cnk20,cnk66} => {cnk12} 0.02274116 0.8271605
## 79 {cnk05,cnk10,cnk20,cnk66} => {cnk12} 0.02437038 0.8013393
## 80 {cnk05,cnk09,cnk20,cnk66} => {cnk12} 0.02403096 0.8329412
## 81 {cnk10,cnk17,cnk20,cnk66} => {cnk12} 0.02470980 0.8070953
## 82 {cnk09,cnk17,cnk20,cnk66} => {cnk12} 0.02633901 0.8416486
## 83 {cnk17,cnk20,cnk66,cnk78a} => {cnk12} 0.02063675 0.8328767
## 84 {cnk09,cnk10,cnk20,cnk66} => {cnk12} 0.02559229 0.8160173
## 85 {cnk09,cnk20,cnk66,cnk78a} => {cnk12} 0.02280904 0.8421053
## 86 {cnk09,cnk18,cnk20,cnk66} => {cnk12} 0.02192655 0.8136020
## 87 {cnk09,cnk16,cnk20,cnk66} => {cnk12} 0.03136243 0.8105263
## 88 {cnk16,cnk20,cnk66,cnk78a} => {cnk12} 0.02593171 0.8042105
## 89 {cnk09,cnk16,cnk20,cnk21,cnk66} => {cnk12} 0.02090829 0.8555556
## 90 {cnk09,cnk16,cnk17,cnk21,cnk66} => {cnk12} 0.02043310 0.8179348
## 91 {cnk09,cnk16,cnk17,cnk20,cnk66} => {cnk12} 0.02104406 0.8587258
## 92 {cnk09,cnk10,cnk12,cnk20,cnk66} => {cnk16} 0.02056887 0.8037135
## 93 {cnk09,cnk10,cnk16,cnk20,cnk66} => {cnk12} 0.02056887 0.8416667
## lift
## 1 7.636954
## 2 7.718237
## 3 7.656606
## 4 7.664285
## 5 7.661961
## 6 7.612920
## 7 7.575510
## 8 7.646193
## 9 7.664285
## 10 7.690114
## 11 7.508633
## 12 7.679865
## 13 7.576814
## 14 7.651369
## 15 7.611194
## 16 7.583182
## 17 7.666768
## 18 7.574992
## 19 7.664013
## 20 7.678795
## 21 7.573745
## 22 7.587888
## 23 2.339345
## 24 2.339304
## 25 2.410846
## 26 2.337790
## 27 2.386976
## 28 2.537072
## 29 2.393592
## 30 2.339365
## 31 2.355355
## 32 2.379647
## 33 2.343304
## 34 2.337790
## 35 2.362334
## 36 2.356150
## 37 2.464157
## 38 2.081920
## 39 2.052319
## 40 2.409845
## 41 2.402240
## 42 2.380269
## 43 2.356880
## 44 2.470491
## 45 2.361217
## 46 2.437803
## 47 2.410846
## 48 2.424375
## 49 2.460047
## 50 2.360512
## 51 2.426262
## 52 2.400665
## 53 2.476816
## 54 2.369541
## 55 2.337790
## 56 2.468787
## 57 2.361484
## 58 2.409681
## 59 2.403261
## 60 2.342429
## 61 2.361513
## 62 2.348788
## 63 2.346875
## 64 2.348095
## 65 2.344832
## 66 2.376325
## 67 2.361404
## 68 2.339212
## 69 2.366267
## 70 2.343115
## 71 2.340811
## 72 2.359098
## 73 2.421479
## 74 2.346022
## 75 2.440928
## 76 2.345940
## 77 2.343520
## 78 2.417160
## 79 2.341704
## 80 2.434052
## 81 2.358524
## 82 2.459497
## 83 2.433864
## 84 2.384597
## 85 2.460832
## 86 2.377538
## 87 2.368551
## 88 2.350094
## 89 2.500137
## 90 2.390200
## 91 2.509401
## 92 2.053331
## 93 2.459550
# Number the rules 1..n for labelling. seq_len(nrow(...)) states the intent
# directly; the original row(reguly)[, 1] built a full index matrix only to
# take its first column - same result, more work.
reguly$n <- seq_len(nrow(reguly))
ggplot(reguly, aes(x = support, y = confidence, label = n, color = lift)) +
  geom_point(size = 3) +
  geom_text(aes(label = n), hjust = 2, vjust = 2) +
  ggtitle("Scatter plot for 93 rules")
As we can see, rule 1 has large support and confidence, but we found it not interesting, because on the Right Hand Side it contains only the entering of station cnk19 and on the Left Hand Side it contains the exit of station cnk19. The next 21 rules are also not interesting, because they look like rule 1 with one more station added. So taking this into consideration, we dropped the first 22 rules.
library(ggplot2)
library(arules)
# The same scatter plot with the 22 trivial cnk19b/cnk19a rules removed.
reguly_istotne <- reguly[-c(1:22), ]
ggplot(reguly_istotne, aes(x = support, y = confidence, label = n, color = lift)) +
  geom_point(size = 3) +
  geom_text(aes(label = n), hjust = 1.5, vjust = 1.5) +
  ggtitle("Scatter plot for 71 rules")
We found rules 56 and 82 interesting.
library(arules)
# Show the two rules selected for closer inspection on the CNK map.
reguly[c(56,82),]
## lhs rhs support confidence lift n
## 56 {cnk09,cnk20,cnk21,cnk66} => {cnk12} 0.02661055 0.8448276 2.468787 56
## 82 {cnk09,cnk17,cnk20,cnk66} => {cnk12} 0.02633901 0.8416486 2.459497 82
To better understand these rules, we presented them on the map of CNK. First, let’s look at the full map:
Map of the CNK
Now, we zoom in our map to show rules number 56 and 82.
zoomed map
Next we used library arulesViz to present our rules.
1.Graph plot: Used to visualize association rules using vertices and edges where vertices typically represent items or item-sets and edges indicate relationship in rules. First there is a graph for all 93 the rules and the second one is for 71 rules.
library(arules)
# Restrict to rules whose RHS is cnk12 or cnk16 (the non-trivial rules,
# nr 23-93; same subset as computed earlier).
smaller_rules<-subset(rules,subset=rhs %in% c("cnk12","cnk16")) # rules nr 23-93
# Graph visualisation: vertices are items/item-sets, edges indicate rules;
# first all 93 rules, then the 71 non-trivial ones.
plot(head(sort(rules, by="lift"), 93), method="graph", control=list(cex=.7))
plot(head(sort(smaller_rules, by="lift"), 71), method="graph", control=list(cex=.7))
We see that the rule {cnk19b} => {cnk19a} is most common one. Our rules divide into two groups. At the second graph we get rules without first 22 rules, and now we have only one group with longer rules, but they have smaller lift than the first 22 rules.
2.Grouped Matrix: It is similar to matrix based plot. Here rules are grouped to present as an aggregate in the matrix.
library(ggplot2)
library(arules)
# Grouped-matrix visualisation of all 93 rules: LHS item groups vs RHS items
# (circle size = support, colour = lift; see the parameter dump below).
plot(head(sort(rules, by="lift"), 93), method="grouped", control=list(cex=.7))
## Available parameter (with default values):
## main = Grouped matrix for 93 rules
## k = 20
## aggr.fun = function (x, na.rm = FALSE) UseMethod("median")
## col = c("#D33F6A", "#D34269", "#D44468", "#D54667", "#D54866", "#D64A65", "#D74C63", "#D74E62", "#D85061", "#D95260", "#D9545F", "#DA565E", "#DB585D", "#DB5A5B", "#DC5C5A", "#DC5E59", "#DD6058", "#DD6256", "#DE6355", "#DF6554", "#DF6753", "#E06951", "#E06B50", "#E16D4F", "#E16E4D", "#E2704C", "#E2724B", "#E27449", "#E37548", "#E37747", "#E47945", "#E47B44", "#E47C42", "#E57E41", "#E58040", "#E6823E", "#E6833D", "#E6853B", "#E6873A", "#E78939", "#E78A37", "#E78C36", "#E88E35", "#E88F33", "#E89132", "#E89331", "#E89430", "#E9962E", "#E9982D", "#E99A2C", "#E99B2B", "#E99D2B", "#E99F2A", "#E9A029", "#E9A229", "#EAA428", "#EAA528", "#EAA728", "#EAA928", "#EAAA28", "#EAAC28", "#EAAE29", "#EAAF29", "#EAB12A", "#EAB32B", "#E9B42C", "#E9B62D", "#E9B82F", "#E9B930", "#E9BB32", "#E9BC33", "#E9BE35", "#E9C037", "#E8C139", "#E8C33B", "#E8C53D", "#E8C640", "#E8C842", "#E7C944", "#E7CB47", "#E7CD4A", "#E7CE4C", "#E6D04F", "#E6D152", "#E6D355", "#E5D458", "#E5D65B", "#E5D85E", "#E5D961", "#E4DB64", "#E4DC68", "#E4DE6C", "#E3DF6F", "#E3E173", "#E3E278", "#E2E37D", "#E2E582", "#E2E688", "#E2E791", "#E2E6BD")
## reverse = TRUE
## xlab = NULL
## ylab = NULL
## legend = size: support color: lift
## panel.function = function (row, size, shading, spacing) { size[size == 0] <- NA shading[is.na(shading)] <- 1 grid.circle(x = c(1:length(size)), y = row, r = size/2 * (1 - spacing), default.units = "native", gp = gpar(fill = shading, alpha = 0.9)) }
## spacing = -1
## newpage = TRUE
## gp_labels = list(cex = 0.8)
## gp_panels = list()
## interactive = FALSE
## max.shading = NA
## verbose = FALSE
plot(head(sort(smaller_rules, by="lift"), 71), method="grouped", control=list(cex=.7))
## Available parameter (with default values):
## main = Grouped matrix for 71 rules
## k = 20
## aggr.fun = function (x, na.rm = FALSE) UseMethod("median")
## col = c("#D33F6A", "#D34269", "#D44468", "#D54667", "#D54866", "#D64A65", "#D74C63", "#D74E62", "#D85061", "#D95260", "#D9545F", "#DA565E", "#DB585D", "#DB5A5B", "#DC5C5A", "#DC5E59", "#DD6058", "#DD6256", "#DE6355", "#DF6554", "#DF6753", "#E06951", "#E06B50", "#E16D4F", "#E16E4D", "#E2704C", "#E2724B", "#E27449", "#E37548", "#E37747", "#E47945", "#E47B44", "#E47C42", "#E57E41", "#E58040", "#E6823E", "#E6833D", "#E6853B", "#E6873A", "#E78939", "#E78A37", "#E78C36", "#E88E35", "#E88F33", "#E89132", "#E89331", "#E89430", "#E9962E", "#E9982D", "#E99A2C", "#E99B2B", "#E99D2B", "#E99F2A", "#E9A029", "#E9A229", "#EAA428", "#EAA528", "#EAA728", "#EAA928", "#EAAA28", "#EAAC28", "#EAAE29", "#EAAF29", "#EAB12A", "#EAB32B", "#E9B42C", "#E9B62D", "#E9B82F", "#E9B930", "#E9BB32", "#E9BC33", "#E9BE35", "#E9C037", "#E8C139", "#E8C33B", "#E8C53D", "#E8C640", "#E8C842", "#E7C944", "#E7CB47", "#E7CD4A", "#E7CE4C", "#E6D04F", "#E6D152", "#E6D355", "#E5D458", "#E5D65B", "#E5D85E", "#E5D961", "#E4DB64", "#E4DC68", "#E4DE6C", "#E3DF6F", "#E3E173", "#E3E278", "#E2E37D", "#E2E582", "#E2E688", "#E2E791", "#E2E6BD")
## reverse = TRUE
## xlab = NULL
## ylab = NULL
## legend = size: support color: lift
## panel.function = function (row, size, shading, spacing) { size[size == 0] <- NA shading[is.na(shading)] <- 1 grid.circle(x = c(1:length(size)), y = row, r = size/2 * (1 - spacing), default.units = "native", gp = gpar(fill = shading, alpha = 0.9)) }
## spacing = -1
## newpage = TRUE
## gp_labels = list(cex = 0.8)
## gp_panels = list()
## interactive = FALSE
## max.shading = NA
## verbose = FALSE
At the end we wanted to present some special object - Markov Chain.
Initially we prepared the data in order to get the number of transitions between each pair of exhibits.
# Load the transition counts: Freq = number of direct passes from station
# "from" to station "to".
load("tab.rda")
colnames(tab)[1] <- "from"
colnames(tab)[2] <- "to"
head(tab)
## from to Freq
## 1 cnk02a cnk02a 0
## 2 cnk02b cnk02a 519
## 3 cnk03 cnk02a 32
## 4 cnk05 cnk02a 127
## 5 cnk06 cnk02a 161
## 6 cnk07 cnk02a 54
Then we wanted to show all the stations on the graph, but it doesn’t look good — there are too many transitions and arrows, which makes the graph unintelligible. Let’s see it:
library(reshape2)
library(igraph)
library(network)
library(sna)
library(qgraph)
library(ndtv)
library(dplyr)
# NOTE(review): the original code also built `razem`, `links`, `t1` and
# `nodes` here, but `t1` was derived from a non-existent `station` column
# (yielding an empty frame) and none of the four objects was ever used, so
# that dead code has been removed.
colnames(tab)[1] <- "from"
colnames(tab)[2] <- "to"
# Total number of transitions leaving each station ...
suma <- aggregate(Freq ~ from, tab, sum)
tab <- left_join(tab, suma, by = "from")
# ... so each row becomes a Markov-chain transition probability:
# prob = Freq(from -> to) / total Freq leaving "from".
colnames(tab)[3] <- "prob"
tab$prob <- tab$prob / tab$Freq.y
# Drop column 4 (the per-station totals) before plotting the full graph.
qgraph(tab[, -4], edge.labels = TRUE)
That is why we decided to present only graphs for two rules:
# Restrict the transition table to the six stations appearing in rules 56
# and 82; %in% replaces the original long chain of "==" comparisons joined
# with "|" (identical result, far more readable).
stacje <- c("cnk09", "cnk12", "cnk17", "cnk20", "cnk21", "cnk66")
tab_reguly <- subset(tab, from %in% stacje & to %in% stacje)
# Drop self-loops and the helper totals column (4) before plotting.
tab_reguly <- subset(tab_reguly[, -4], from != to)
library(qgraph)
qgraph(tab_reguly, edge.labels = TRUE)